library(tidyverse)
library(haven)
library(urltools)
library(rjson)
library(rvest)
library(tldextract)

# Fix the RNG state so any random draws are reproducible.
# NOTE(review): set.seed() interprets its argument as an integer, so pi + 4
# (about 7.14) is presumably truncated to seed 7 -- confirm. No call in the
# visible part of this script draws random numbers.
set.seed(pi + 4)

# Reduce URLs (or bare host names) to the `domain` component reported by
# urltools' public-suffix extraction.
#
# x: character vector of URLs / host names.
# Returns the `domain` column of urltools::suffix_extract() applied to the
# host part of each element of `x`.
# NOTE(review): presumably this is the registered name without subdomain or
# suffix (e.g. "google" for "www.google.co.uk") -- confirm against urltools.
get_proper_domain <- function(x) {
  hosts <- urltools::url_parse(x)[["domain"]]
  suffix_parts <- urltools::suffix_extract(hosts)
  suffix_parts[["domain"]]
}

# Load the URL-level issue results and the raw survey export.
news <- read_csv("../results/issue_urls.csv")

survey <- read_sav("../data/November Survey/STAN0089_OUTPUT_all.sav")

# internet_primary is 1 when any of main_news_source_{2..7}_pre equals 1 and
# 0 otherwise. Missing answers never produce NA: `%in% TRUE` maps both FALSE
# and NA to FALSE, so they end up as 0.
# NOTE(review): this column is not listed in the select() applied to `survey`
# further down, so it does not reach the merged output -- confirm intended.
survey <- mutate(
  survey,
  internet_primary = as.numeric(
    (
      main_news_source_2_pre == 1 |
        main_news_source_3_pre == 1 |
        main_news_source_4_pre == 1 |
        main_news_source_5_pre == 1 |
        main_news_source_6_pre == 1 |
        main_news_source_7_pre == 1
    ) %in% TRUE
  )
)


# I should just get all the variables Peterson et al. or Guess have
# Recode the raw survey columns into the analysis covariates, keep only those
# covariates, convert labelled columns to factors and drop unused levels.
survey <- mutate(
  survey,
  caseid = as.integer(caseid),
  # party ID with value labels stripped; code 8 is recoded to NA
  pid = na_if(zap_labels(pid7_pre), 8),
  # anything outside 1:3 / 5:7 -- including NA -- lands in "ind"
  party = as_factor(case_when(
    pid %in% 1:3 ~ "dem",
    pid %in% 5:7 ~ "rep",
    TRUE ~ "ind"
  )),
  income = na_if(faminc_pre, 97),
  female = gender_pre,
  age = as.integer(2016 - birthyr_pre), # survey was administered Nov. 2016
  educ = educ_pre,
  ideology = na_if(ideo5_pre, 6),
  # temporary helper column; it is not in the select() below, so it is dropped
  vote16_choice = as_factor(presvote16_post),
  # vote indicators: 1 for the named candidate, 0 for everything else (NA too)
  clinton = as.numeric((vote16_choice == "Hillary Clinton") %in% TRUE),
  trump = as.numeric((vote16_choice == "Donald Trump") %in% TRUE),
  voted_major = clinton + trump,
  abortion = Q26_pre,
  refugees = Q27_pre,
  # 1 when Q2_pre equals 1, otherwise 0 (missing answers count as 0)
  primary = as.numeric((Q2_pre == 1) %in% TRUE),
  following = Q31_pre, # similar to newsint in CCES 2016
  race = as_factor(case_when(
    race_pre == 1 ~ "White",
    race_pre == 2 ~ "Black",
    race_pre == 3 ~ "Hispanic",
    race_pre == 4 ~ "Asian",
    TRUE ~ "Other"
  )),
  weight = as.numeric(weight_pulse)
)

survey <- select(
  survey,
  caseid, pid, party, income, female, age, educ,
  ideology, clinton, trump, voted_major, refugees, abortion,
  primary, following, race, weight
)

# Turn the remaining labelled columns into factors, then drop unused levels.
survey <- as_factor(survey)
survey <- mutate_if(survey, is.factor, fct_drop)

# need to select some columns

# Attach the respondent-level covariates to every URL row, matching on caseid.
news <- left_join(news, survey, by = "caseid")

# Domain-level alignment scores: keep the domain and its average alignment,
# strip a leading "www." and keep the first row for each resulting domain.
# The pattern is anchored and the dot escaped: the previous "www." was an
# unanchored regex whose "." matched any character, so it could cut "www" plus
# one arbitrary character out of the middle of a domain.
bakshy <- read_csv("../data/top500.csv") %>%
  select(domain, avg_align) %>%
  mutate(domain = str_remove(domain, "^www\\.")) %>%
  distinct(domain, .keep_all = TRUE)

# Lookup vectors used by the matching code below.
bakshy_domains <- bakshy$domain
# NOTE(review): identical to bakshy_domains and not referenced in the visible
# part of this script; kept in case other code relies on it.
proper_bakshy_domains <- bakshy$domain
bakshy_align <- bakshy$avg_align

# Match each URL to a domain from the bakshy table and return its alignment.
#
# url_list: character vector of URLs (may be empty or contain NA).
# Returns a tibble with one row per element of `url_list`, holding
# `bakshy_domain` (the matched domain, NA when nothing matches) plus the
# remaining columns of the global `bakshy` table joined on that domain.
#
# Relies on the globals `bakshy_domains`, `bakshy` and `get_proper_domain()`.
find_bakshy_align <- function(url_list) {
  # the bakshy domains are not in a standard format: some have subdomains
  # and some don't. The plan is to collect every bakshy domain that occurs
  # in the URL and, if several are found, choose the longest one.
  pick_domain <- function(url) {
    # fixed() matches the domains literally; as regex patterns every "."
    # would match any character. which() drops NA results (e.g. NA URLs).
    hits <- which(str_detect(url, fixed(bakshy_domains)))
    candidates <- bakshy_domains[hits]
    if (length(candidates) == 0) {
      return(NA_character_)
    }

    # this is necessary to prevent cross-site domain references (a bakshy
    # domain appearing elsewhere in the URL) from ruining the algorithm:
    # keep only candidates whose proper domain equals the URL's own.
    same_site <- get_proper_domain(candidates) == get_proper_domain(url)
    candidates <- candidates[which(same_site)]
    if (length(candidates) == 0) {
      return(NA_character_)
    }

    # Choose the longest surviving candidate. The previous implementation
    # passed the candidates through a scalar ifelse(), which silently kept
    # only the first one, so this step never had more than one option.
    candidates[which.max(nchar(candidates))]
  }

  # map_chr() copes with an empty url_list, unlike a 1:length() loop.
  matched <- map_chr(url_list, pick_domain)

  tibble(domain = matched) %>%
    left_join(bakshy, by = "domain") %>%
    rename(bakshy_domain = domain)
}

# Look up the matching domain and alignment score for every URL and append
# those columns to the URL table.
news <- bind_cols(news, find_bakshy_align(pull(news, url)))

# Use the shorter name b_align for the alignment score.
news <- rename(news, b_align = avg_align)

# Persist the merged data set.
write_csv(news, "../results/merged_issues.csv")
